Supplement B: Fitting Models
1 Overview
Goal: model the spatial distribution of ranch clusters and populations.
Data: Dr. Macfarlan’s ethnographic data, along with ecological and socio-economic variables.
Method: binomial GLM for occupied vs abandoned, negative binomial for population
We want to know what motivates market-integrated, subsistence ranchers to live where they do. This is framed as a trade-off between market integration and ecology.
2 R Preamble
library(MASS)
library(broom)
library(car)
library(here)
library(gt)
library(patchwork)
library(sf)
library(sfdep)
library(terra)
library(tidyterra)
library(tidyverse)
library(viridis)Specify consistent plot theme here:
Code
theme_set(theme_bw(12))
theme_update(
panel.grid = element_blank(),
plot.margin = margin(2,2,2,2),
strip.background = element_blank(),
strip.text = element_text(hjust = 0, size = rel(1.1))
)And some defaults for table formatting:
Code
rename_models <- function(x){
x |>
mutate(
model = case_when(
model == "occupied" ~ "Occupied-Abandoned",
model == "population" ~ "Population",
model == "secure" ~ "Population [+ secure]",
model == "securex" ~ "Population [x secure]",
TRUE ~ NA
)
)
}
format_double <- function(x){
x |> mutate(across(where(is.double), \(.x){ round(.x, digits = 3) }))
}
model_table <- function(x){
x |>
gt() |>
tab_options(
column_labels.font.weight = "bold",
data_row.padding = px(6),
heading.align = "left",
heading.border.bottom.style = "hidden",
row.striping.background_color = "gray96",
row.striping.include_stub = FALSE,
# row.striping.include_table_body = TRUE,
table.align = "left",
table.border.top.style = "hidden",
table.border.bottom.style = "hidden"
) |>
sub_missing(missing_text = "")
}3 Data
3.1 Geopackage
Specify path to geopackage database holding all spatial vector data.
gpkg <- here("data", "choyero.gpkg")3.2 Dependent variables
- Occupied vs Abandoned ranches
- Population size of occupied ranches
Rancheros live in “ranch clusters” occupied by close kin.
ranches <- read_sf(gpkg, layer = "ranches")
clusters <- read_sf(gpkg, layer = "clusters")3.3 Independent variables
- Market-integration: cost-distance (sort of) to La Paz and Constitucion (hours)
- Ecology: cost-distance to springs (hours)
Cost-distance to La Paz/Ciudad Constitucion provides a proxy measure of access to markets where rancheros sell livestock and buy food and equipment. Cost-distance to springs represents habitat suitability for small-scale pastoralist economies. Measuring it is a complicated matter, as property rights are involved and multiple springs are used. To avoid some of that complexity, we take the average travel time from a ranch cluster to its two nearest springs. So, this is our basic trade-off between market-integration and local ecology. For further details about how we estimate this variable, see “R/leastcostpaths.html.”
path_to_cities <- read_sf(gpkg, layer = "path_to_cities")
path_to_springs <- read_sf(gpkg, layer = "path_to_springs")
security <- ranches |>
st_drop_geometry() |>
select(cluster, land) |>
mutate(secure = ifelse(land == "secure", 1, 0)) |>
group_by(cluster) |>
summarize(secure = sum(secure)/n())
clusters <-
clusters |>
mutate(
springs = path_to_springs$cost,
cities = path_to_cities$cost,
paz = path_to_cities$time_to_paz,
secure = security$secure
)
remove(path_to_cities, path_to_springs, security, ranches)3.4 Topography
For additional context, here’s what the terrain looks like.
elevation <- here("data", "dem_30m.tif") |> rast()
elevation_stats <- global(
elevation,
fun = c("min", "max", "mean", "sd"),
na.rm = TRUE
)Code
elevation_stats |>
pivot_longer(
everything(),
names_to = "statistic",
values_to = "value"
) |>
gt() |>
tab_header("Elevation") |>
fmt_number(-statistic, decimals = 1) |>
cols_width(statistic ~ px(125)) |>
tab_options(
column_labels.hidden = TRUE,
data_row.padding = px(6),
heading.align = "left",
heading.border.bottom.style = "hidden",
row.striping.background_color = "gray96",
row.striping.include_stub = FALSE,
row.striping.include_table_body = TRUE,
table.align = "left",
table.border.top.style = "hidden",
table.border.bottom.style = "hidden"
)| Elevation | |
|---|---|
| min | 0.0 |
| max | 979.6 |
| mean | 426.9 |
| sd | 142.3 |
Code
hill <- shade(
terrain(elevation, "slope", unit = "radians"),
terrain(elevation, "aspect", unit = "radians"),
angle = 30,
direction = seq(45, 360, by = 45)
)
hill <- setValues(hill, scales::rescale(values(hill), to = c(1, 1000)))
hill <- round(hill)
ggplot() +
geom_spatraster(
data = hill,
fill = hcl.colors(1000, "Grays")[values(hill)],
alpha = 0.75,
maxcell = Inf
) +
geom_spatraster(data = elevation, alpha = 0.5) +
scale_fill_hypso_c(
name = "Elevation (m)",
palette = "utah_1",
limits = elevation_stats[, c("min", "max")] |> t() |> c(),
alpha = 0.5,
na.value = "transparent"
) +
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
legend.position = "none"
)3.5 Data Summary
data_summary <- function(x, .m, .v){
.f <- list(min = min, max = max, mean = mean, sd = sd)
x |>
st_drop_geometry() |>
mutate(model = .m) |>
group_by(model) |>
summarize(across({{ .v }}, .fns = .f)) |>
pivot_longer(
-model,
names_sep = "_",
names_to = c("variable", "statistic")
) |>
pivot_wider(
names_from = statistic,
values_from = value
)
}
variable_summary <- bind_rows(
clusters |> data_summary("Binomial", c(occupied, springs, paz)),
clusters |> filter(population > 0) |> data_summary("Poisson", c(population, springs, cities, secure))
)
remove(data_summary)
write_csv(variable_summary, here("data", "variable-summary.csv"))
variable_summary |>
group_by(model) |>
gt() |>
fmt_number(decimals = 2) |>
tab_options(
column_labels.font.weight = "bold",
data_row.padding = px(6),
heading.align = "left",
heading.border.bottom.style = "hidden",
row_group.as_column = TRUE,
table.border.top.style = "hidden",
table.border.bottom.style = "hidden"
) |>
sub_missing(missing_text = "")| variable | min | max | mean | sd | |
|---|---|---|---|---|---|
| Binomial | occupied | 0.00 | 1.00 | 0.46 | 0.50 |
| springs | 0.00 | 1.28 | 0.23 | 0.27 | |
| paz | 0.13 | 3.18 | 1.11 | 0.71 | |
| Poisson | population | 1.00 | 32.00 | 8.79 | 8.11 |
| springs | 0.00 | 0.68 | 0.11 | 0.18 | |
| cities | 0.22 | 2.68 | 1.08 | 0.63 | |
| secure | 0.00 | 1.00 | 0.30 | 0.43 |
4 Analysis
4.1 Models
In addition to a binomial model of the status of each ranch cluster (occupied vs abandoned), we fit a negative binomial model of the population count of occupied ranch clusters. The negative binomial is an alternative to standard Poisson models where there is an expectation of dispersion in the residuals. The Poisson model assumes that the variance is equal to the mean, but this is not always the case.
models <- list(
occupied = glm(
occupied ~ paz + springs,
family = binomial,
data = clusters
),
population = glm.nb(
population ~ cities + springs,
data = clusters |> filter(population > 0)
),
secure = glm.nb(
population ~ (cities + springs) + secure,
data = clusters |> filter(population > 0)
),
securex = glm.nb(
population ~ (cities + springs) * secure,
data = clusters |> filter(population > 0)
)
)4.2 ANOVA (LRT)
We conduct a Likelihood Ratio Test of the population models to see if adding land security makes a significant improvement to the model.
lrt <- with(models, anova(population, secure, securex, test = "LRT"))4.3 Residual Autocorrelation
We use Monte Carlo simulations of Moran’s I to test for spatial autocorrelation in the untransformed residuals of each model.
get_moran <- function(x, .sf){
is_nb <- grepl("Negative Binomial", x$family[["family"]])
if (is_nb) .sf <- .sf |> filter(occupied == 1)
geometry <- st_geometry(.sf)
neighbors <- st_knn(geometry, k = nrow(.sf)-1)
weights <- st_inverse_distance(neighbors, geometry)
moran <- global_moran_perm(residuals(x), neighbors, weights)
moran |>
tidy() |>
mutate(variable = "residuals", .before = 1)
}
morans_i <- models |>
map(\(.x){ get_moran(.x, clusters) }) |>
bind_rows(.id = "model")4.4 Variance Inflation
A model of variance inflation tests for multi-collinearity in the predictors.
vif <- models |>
map(car::vif) |>
bind_rows(.id = "model")5 Results
5.1 Model evaluation
Code
model_statistics <- models |>
map(glance) |>
# glance returns logLik column as numeric for glm and 'logLik" for negbin
map(\(.x){ mutate(.x, across(everything(), as.numeric)) }) |>
bind_rows(.id = "model") |>
format_double() |>
rename_models() |>
rename("n.obs" = nobs) |>
relocate(model)
model_statistics |>
model_table() |>
tab_header(title = "Model Statistics")| Model Statistics | ||||||||
|---|---|---|---|---|---|---|---|---|
| model | null.deviance | df.null | logLik | AIC | BIC | deviance | df.residual | n.obs |
| Occupied-Abandoned | 99.313 | 71 | -30.488 | 66.975 | 73.805 | 60.975 | 69 | 72 |
| Population | 48.629 | 32 | -97.659 | 203.319 | 209.305 | 32.737 | 30 | 33 |
| Population [+ secure] | 48.936 | 32 | -97.577 | 205.153 | 212.636 | 32.775 | 29 | 33 |
| Population [x secure] | 50.399 | 32 | -97.166 | 208.333 | 218.808 | 32.902 | 27 | 33 |
Code
coefficient_estimates <- models |>
map(tidy) |>
bind_rows(.id = "model") |>
format_double() |>
rename_models() |>
rename("z value" = statistic) |>
rename_with(gsub, pattern = "\\.", replacement = " ")
coefficient_estimates |>
group_by(model) |>
model_table() |>
tab_options(row_group.as_column = TRUE) |>
tab_stubhead("model") |>
tab_header(title = "Coefficient Estimates")| Coefficient Estimates | |||||
|---|---|---|---|---|---|
| model | term | estimate | std error | z value | p value |
| Occupied-Abandoned | (Intercept) | -2.489 | 0.836 | -2.977 | 0.003 |
| paz | 2.797 | 0.757 | 3.695 | 0.000 | |
| springs | -4.803 | 1.828 | -2.628 | 0.009 | |
| Population | (Intercept) | 2.844 | 0.238 | 11.939 | 0.000 |
| cities | -0.802 | 0.214 | -3.753 | 0.000 | |
| springs | 0.699 | 0.673 | 1.039 | 0.299 | |
| Population [+ secure] | (Intercept) | 2.883 | 0.265 | 10.877 | 0.000 |
| cities | -0.804 | 0.214 | -3.759 | 0.000 | |
| springs | 0.677 | 0.674 | 1.004 | 0.315 | |
| secure | -0.114 | 0.284 | -0.403 | 0.687 | |
| Population [x secure] | (Intercept) | 3.011 | 0.301 | 9.999 | 0.000 |
| cities | -0.919 | 0.256 | -3.590 | 0.000 | |
| springs | 0.539 | 0.742 | 0.726 | 0.468 | |
| secure | -0.548 | 0.540 | -1.014 | 0.310 | |
| cities:secure | 0.489 | 0.611 | 0.800 | 0.424 | |
| springs:secure | -0.164 | 2.013 | -0.081 | 0.935 | |
Code
lrt <- lrt |>
as_tibble() |>
rename_with(tolower) |>
format_double() |>
mutate(
model = case_when(
model == "cities + springs" ~ "Population",
model == "(cities + springs) + secure" ~ "Population [+ secure]",
model == "(cities + springs) * secure" ~ "Population [x secure]",
TRUE ~ NA
)
)
lrt |>
model_table() |>
tab_header(
title = "Likelihood Ratio Test",
subtitle = "Alternative: difference in log-likelihood greater than zero"
) |>
cols_width(model ~ pct(21.5))| Likelihood Ratio Test | |||||||
|---|---|---|---|---|---|---|---|
| Alternative: difference in log-likelihood greater than zero | |||||||
| model | theta | resid. df | 2 x log-lik. | test | df | lr stat. | pr(chi) |
| Population | 2.977 | 30 | -195.319 | ||||
| Population [+ secure] | 3.003 | 29 | -195.153 | 1 vs 2 | 1 | 0.166 | 0.684 |
| Population [x secure] | 3.130 | 27 | -194.333 | 2 vs 3 | 2 | 0.820 | 0.663 |
Code
morans_i <- morans_i |>
format_double() |>
rename_models() |>
rename("rank" = parameter)
morans_i |>
select(model, statistic, rank, p.value) |>
model_table() |>
tab_header(
title = "Monte Carlo Simulations of Moran's I",
subtitle = "Alternative: spatial autocorrelation (two-sided)"
)| Monte Carlo Simulations of Moran's I | |||
|---|---|---|---|
| Alternative: spatial autocorrelation (two-sided) | |||
| model | statistic | rank | p.value |
| Occupied-Abandoned | -0.023 | 200 | 0.800 |
| Population | -0.082 | 84 | 0.336 |
| Population [+ secure] | -0.085 | 104 | 0.416 |
| Population [x secure] | -0.080 | 89 | 0.356 |
Code
vif <- vif |>
format_double() |>
rename_models()
vif |>
model_table() |>
tab_header(
title = "Variance Inflation"
) |>
cols_width(model ~ pct(21.5))| Variance Inflation | ||||||
|---|---|---|---|---|---|---|
| model | paz | springs | cities | secure | cities:secure | springs:secure |
| Occupied-Abandoned | 1.357 | 1.357 | ||||
| Population | 1.071 | 1.071 | ||||
| Population [+ secure] | 1.079 | 1.082 | 1.029 | |||
| Population [x secure] | 1.334 | 1.567 | 3.832 | 5.976 | 2.558 | |
High VIF scores for interaction terms can be ignored.
5.2 Marginal Response
Here we calculate the marginal response of each response variable relative to change in each focal variable while holding all non-focal variables at their mean.
# to make the lines smoother when plotting
densify <- function(x, n = 300) seq(min(x), max(x), length = n)
template <- with(
clusters,
tibble(
paz = densify(paz),
cities = densify(cities),
springs = densify(springs)
)
)
estimate <- function(model, newdata){
est <- predict(
model,
newdata = newdata,
se.fit = TRUE
)
inv_link <- family(model)$linkinv
est <- with(
est,
tibble(
y = inv_link(fit),
hi = inv_link(fit + 2*se.fit),
lo = inv_link(fit - 2*se.fit)
)
)
bind_cols(newdata, est)
}
responses <- tibble(
model = c("occupied", "occupied", "population", "population"),
variable = c("paz", "springs", "cities", "springs")
) |>
mutate(
model_object = models[model],
data = list(template),
data = map2(data, variable, \(.x, .y){ .x |> mutate(across(-all_of(.y), mean)) }),
estimate = map2(model_object, data, estimate),
estimate = map2(estimate, variable, \(.x, .y){ rename(.x, "x" = all_of(.y)) })
) |>
select(model, variable, estimate) |>
unnest(estimate) |>
select(model, variable, x, y, hi, lo)Code
# add old data to display points
observations <- map(models, \(.x){ .x$model })
observations <- tibble(
model = c("occupied", "occupied", "population", "population"),
variable = c("paz", "springs", "cities", "springs")
) |>
mutate(
old_data = observations[model],
old_data = map2(old_data, variable, \(.x, .y){ rename(.x, "x" = all_of(.y)) }),
old_data = map2(old_data, model, \(.x, .y){ rename(.x, "y" = all_of(.y)) })
) |>
unnest(old_data) |>
select(model, variable, x, y)
baja_labels <- c(
"paz" = "To La Paz",
"springs" = "To Springs",
"cities" = "To Cities"
)
gg_occupied <- ggplot() +
geom_ribbon(
data = responses |> filter(model == "occupied"),
aes(x, ymin = lo, ymax = hi),
fill = "#F5F5F5",
color = "transparent"
) +
geom_line(
data = responses |> filter(model == "occupied"),
aes(x, y),
linewidth = 0.7
) +
geom_point(
data = observations |> filter(model == "occupied"),
aes(x, y),
size = 2.5,
color = alpha("#2E4756", 0.7)
) +
labs(
x = "Hours",
y = "Abandoned-Occupied"
) +
facet_wrap(
vars(variable),
scales = "free_x",
labeller = labeller(variable = baja_labels)
) +
theme(
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.x = element_blank()
)
gg_population <- ggplot() +
geom_ribbon(
data = responses |> filter(model == "population"),
aes(x, ymin = lo, ymax = hi),
fill = "#F5F5F5",
color = "transparent"
) +
geom_line(
data = responses |> filter(model == "population"),
aes(x, y),
linewidth = 0.7
) +
geom_point(
data = observations |> filter(model == "population"),
aes(x, y),
size = 2.5,
color = alpha("#2E4756", 0.7)
) +
labs(
x = "Hours",
y = "Population"
) +
facet_wrap(
vars(variable),
scales = "free_x",
labeller = labeller(variable = baja_labels)
)
gg <- gg_occupied/gg_population
ggsave(
plot = gg,
here("figures", "response-plots.png"),
dpi = 600,
width = 5.75,
height = 5.75
)6 Session Info
Code
# save the session info as an object
pkg_sesh <- sessioninfo::session_info(pkgs = "attached")
# inject the quarto info
pkg_sesh$platform$quarto <- paste(
quarto::quarto_version(),
"@",
quarto::quarto_path()
)
# print it out
pkg_sesh─ Session info ───────────────────────────────────────────────────────────────
setting value
version R version 4.3.1 (2023-06-16 ucrt)
os Windows 11 x64 (build 22621)
system x86_64, mingw32
ui RTerm
language (EN)
collate English_United States.utf8
ctype English_United States.utf8
tz America/Denver
date 2024-01-23
pandoc 3.1.1 @ C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)
quarto 1.4.545 @ C:\\PROGRA~1\\Quarto\\bin\\quarto.exe
─ Packages ───────────────────────────────────────────────────────────────────
package * version date (UTC) lib source
broom * 1.0.5 2023-06-09 [1] CRAN (R 4.3.1)
car * 3.1-2 2023-03-30 [1] CRAN (R 4.3.1)
carData * 3.0-5 2022-01-06 [1] CRAN (R 4.3.1)
dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.3.1)
forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.3.1)
ggplot2 * 3.4.4 2023-10-12 [1] CRAN (R 4.3.2)
gt * 0.10.1 2024-01-17 [1] CRAN (R 4.3.2)
here * 1.0.1 2020-12-13 [1] CRAN (R 4.3.1)
lubridate * 1.9.3 2023-09-27 [1] CRAN (R 4.3.2)
MASS * 7.3-60 2023-05-04 [2] CRAN (R 4.3.1)
patchwork * 1.2.0 2024-01-08 [1] CRAN (R 4.3.2)
purrr * 1.0.2 2023-08-10 [1] CRAN (R 4.3.1)
readr * 2.1.5 2024-01-10 [1] CRAN (R 4.3.2)
sf * 1.0-15 2023-12-18 [1] CRAN (R 4.3.2)
sfdep * 0.2.3 2023-01-11 [1] CRAN (R 4.3.1)
stringr * 1.5.1 2023-11-14 [1] CRAN (R 4.3.1)
terra * 1.7-65 2023-12-15 [1] CRAN (R 4.3.1)
tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.3.1)
tidyr * 1.3.0 2023-01-24 [1] CRAN (R 4.3.1)
tidyterra * 0.5.2 2024-01-19 [1] CRAN (R 4.3.1)
tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.3.1)
viridis * 0.6.4 2023-07-22 [1] CRAN (R 4.3.1)
viridisLite * 0.4.2 2023-05-02 [1] CRAN (R 4.3.1)
[1] C:/Users/kenne/AppData/Local/R/win-library/4.3
[2] C:/Program Files/R/R-4.3.1/library
──────────────────────────────────────────────────────────────────────────────